#!/usr/bin/env ruby

require_relative 'hyph-utf8'

# Path four components above this file's own path: the first ".." strips the
# file name, so this is three directories above the script's directory.
$path_root=File.expand_path("../../../..", __FILE__)
# "data/encodings" inside the directory that contains this script.
# NOTE(review): not referenced in the code visible here; presumably read by
# hyph-utf8 when loading an encoding -- confirm.
$encoding_data_dir = File.expand_path("../data/encodings", __FILE__)
# Directory that output_file_name joins the generated file names onto.
$output_data_dir = "#{$path_root}/tex/generic/hyph-utf8/conversions"

# Returns the path of the converter file for +encoding+:
# "<$output_data_dir>/conv-utf8-<encoding>.tex".
def output_file_name(encoding)
	tex_name = "conv-utf8-#{encoding}.tex"
	File.join($output_data_dir, tex_name)
end

# printf template for the banner written at the top of every generated file.
# output_copyright_notice fills the placeholders in this order:
#   %s - encoding name (part of the file name),
#   %s - upper-cased encoding name,
#   %d - current year.
# Each "%%" is printf's escape for a literal "%", so every emitted line starts
# with a single "%".
$header = <<__EOHEADER__
%% conv-utf8-%s.tex
%%
%% Conversion from UTF-8 to %s,
%% used before loading hyphenation patterns for 8-bit TeX engines.
%%
%% This file is part of hyph-utf8 package and autogenerated.
%% See http://tug.org/tex-hyphen
%%
%% Copyright 2008-%d TeX Users Group.
%% You may freely use, modify and/or distribute this file.
%% (But consider adapting the scripts if you need modifications.)
__EOHEADER__

# Writes the banner template held in $header to +outfile+, filling in the
# encoding name, its upper-cased form and the current year (in that order).
def output_copyright_notice(outfile, encoding)
	banner_fields = [encoding, encoding.upcase, Time.now.year]
	outfile.printf($header, *banner_fields)
end

# TeX source defining \unicodechar{<number>}: it expands to the control
# sequence "unichar@<number>" when that is defined and raises a TeX error
# otherwise. The heredoc delimiters in this section are unquoted, so every
# "\\" below ends up as a single backslash in the emitted text.
$uniconvmacro1 = <<__EOUNIMAC1__
% macros adapted from ConTeXt MKII; see unic-ini.mkii
\\def\\unicodechar#1{%
	\\ifcsname unichar@\\number#1\\endcsname
		\\csname unichar@\\number#1\\endcsname
	\\else
		\\errmessage{Unicode character [#1] not in encoding.}%
	\\fi}
__EOUNIMAC1__

# Decoder macros indexed by UTF-8 sequence length in bytes (the script looks
# them up as $uniconvmacros[nbytes]). Slots 0 and 1 are nil placeholders so
# that index 2 is the two-byte macro, 3 the three-byte one and 4 the
# four-byte one.
$uniconvmacros = [nil, nil]

# Two-byte sequences: #1 is the lead byte given as a number, #2 the
# continuation byte; the code point 64*(#1-192)+(`#2-128) is handed to
# \unicodechar.
$uniconvmacros << <<__EOTWOBYTES__
\\def\\utftwouniglyph#1#2%
	{\\expandafter\\unicodechar\\expandafter
		{\\the\\numexpr64*(#1-192)+`#2-128\\relax}}
__EOTWOBYTES__

# Three-byte sequences: same scheme with lead-byte offset 224 and two
# continuation bytes.
$uniconvmacros << <<__EOTHREEBYTES__
\\def\\utfthreeuniglyph#1#2#3%
	{\\expandafter\\unicodechar\\expandafter
		{\\the\\numexpr4096*(#1-224)+64*(`#2-128)+`#3-128\\relax}}
__EOTHREEBYTES__

# Four-byte sequences: lead-byte offset 240 and three continuation bytes.
$uniconvmacros << <<__EOFOURBYTES__
\\def\\utffouruniglyph#1#2#3#4%
	{\\expandafter\\unicodechar\\expandafter
		{\\the\\numexpr262144*(#1-240)+4096*(`#2-128)+64*(`#3-128)+`#4-128\\relax}}
__EOFOURBYTES__

# TeX source defining \addunichar <number> <replacement> (both delimited by
# spaces), which stores <replacement> under "unichar@<number>". The text
# starts with an empty line and ends with a TeX comment describing the table
# entries that the script writes right after it.
$uniconvmacro2 = <<__EOUNIMAC2__

\\def\\addunichar #1 #2 {\\expandafter\\def\\csname unichar@\\number#1\\endcsname{#2}}

% \\addunichar "unicode_code - ^^font_encoding_code
__EOUNIMAC2__

# Table-driven converters: every lead byte becomes an active character that
# calls one of the \utf...uniglyph decoders, and \unicodechar then looks the
# resulting number up in the \addunichar table written below.
["t8m", "lth"].each do |encoding|
	enc = HyphEncoding.new(encoding)

	File.open(output_file_name(encoding), "w") do |tex|
		# banner, followed by an empty line
		output_copyright_notice(tex, encoding)
		tex.puts

		# \unicodechar: maps a Unicode number to the font encoding, or reports an error
		tex.puts $uniconvmacro1

		# byte lengths of the first and last character in sorted order, taken as
		# the shortest and longest lengths in the encoding (until now just 2 & 3)
		by_code = enc.unicode_characters.sort
		shortest = by_code.first[1].bytes.size
		longest = by_code.last[1].bytes.size

		# emit only the decoders for the byte lengths that are actually needed
		shortest.upto(longest) { |nbytes| tex.puts $uniconvmacros[nbytes] }

		# \addunichar: stores one Unicode -> font encoding mapping
		tex.puts $uniconvmacro2

		# one table line per character, which also sets its \lccode to itself
		enc.unicode_characters.sort.each do |_code, char|
			glyph = [char.code_uni].pack('U')
			tex.printf("\\addunichar \"%04X ^^%02x \\lccode\"%02X=\"%02X %% %s - %s\n",
				char.code_uni, char.code_enc, char.code_enc, char.code_enc, glyph, char.name)
		end
		tex.puts

		# Word used in the decoder macro name for each sequence length:
		#   2-byte: 0b11000000 <= lead byte < 0b11100000
		#   3-byte: 0b11100000 <= lead byte < 0b11110000
		#   4-byte: 0b11110000 <= lead byte < 0b11111000
		# Any other length yields nil, which printf renders as an empty string.
		length_words = { 2 => "two", 3 => "three", 4 => "four" }

		# make every possible lead byte active and define it
		enc.unicode_characters_first_byte.sort.each do |first_byte_code, chars|
			lead = first_byte_code.hex
			word = length_words[chars[0].bytes.size]
			tex.printf("\\catcode\"%02X=\\active \\def^^%02x{\\utf%suniglyph{\"%02X}}\n", lead, lead, word, lead)
		end
	end
end

# Converters for encodings whose characters all take exactly two UTF-8 bytes.
# Every lead byte becomes an active one-argument macro that compares the
# following byte against each known continuation byte in a chain of
# \ifx ... \else tests and ends in a TeX error when none matches.
# Raises RuntimeError if a lead byte's first character is not two bytes long.
["ec", "qx", "t2a", "lmc", "il2", "il3", "l7x"].each do |encoding|
	# load encoding
	e = HyphEncoding.new(encoding)

	# open file
	File.open(output_file_name(encoding), "w") do |file_out|

		# copyright notice
		output_copyright_notice(file_out, encoding)
		file_out.puts '%'

		e.unicode_characters_first_byte.sort.each do |first_byte_code,chars|
			# Sort the characters sharing this lead byte by code point, in place.
			# NOTE(review): the definition loop below only sees this order if
			# unicode_characters_first_byte hands back the same arrays on every
			# call -- confirm in HyphEncoding.
			chars.sort!{|x,y| x.code_uni <=> y.code_uni }
			# make all the possible first characters active
			file_out.printf("\\catcode\"%02X=\\active\n", first_byte_code.hex)
		end
		file_out.puts "%"
		e.unicode_characters_first_byte.sort.each do |first_byte_code,chars|
			lead_byte = first_byte_code.hex
			# The \ifx chain below inspects exactly one continuation byte, so
			# anything but a two-byte sequence cannot be handled here. This is
			# an error condition, hence raise rather than throw.
			unless chars[0].bytes.size == 2
				raise "The encoding #{encoding} uses more than two bytes to encode characters"
			end

			file_out.printf("\\def^^%02x#1{%%\n", lead_byte)
			chars.each do |uni_character|
				file_out.printf("\t\\ifx#1^^%02x^^%02x\\else %% %s - U+%04X - %s\n", uni_character.bytes[1], uni_character.code_enc, [uni_character.code_uni].pack('U'), uni_character.code_uni, uni_character.name)
			end
			file_out.puts "\t\\errmessage{Hyphenation pattern file corrupted or #{encoding} encoding not supported!}"
			# one closing \fi for every \ifx opened above
			file_out.puts(("\\fi" * chars.size) + "}")
		end
		file_out.puts '%'
		file_out.puts '% ensure all the chars above have valid \lccode values'
		file_out.puts '%'
		e.lowercase_characters.each do |character|
			# two upper-case hex digits of the encoding byte
			code = [ character.code_enc ].pack("c").unpack("H2").first.upcase
			# \lccode"FF="FF
			file_out.printf "\\lccode\"%s=\"%s %% %s - U+%04X - %s\n", code, code, [character.code_uni].pack('U'), character.code_uni, character.name
		end

		file_out.puts
	end
end
